title: "Mixture Models" author: "Abdul-Rahman" date: "10/10/2019" output: pdf_document: default html_document: default


library("ggplot2")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("mixtools")
## mixtools package, version 1.1.0, Released 2017-03-10
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
library("mosaics")
## Loading required package: Rcpp
library("mosaicsExample")
library("HistData")
library("bootstrap")
library("ggbeeswarm")
library("RColorBrewer")  
library("ggthemes")  
library("magrittr")  
library("plotly")  
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:mosaics':
## 
##     export
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library("pheatmap")  
library("colorspace")  
library("grid")  
## 
## Attaching package: 'grid'
## The following object is masked from 'package:mixtools':
## 
##     depth
library("ggbio") 
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Need specific help about ggbio? try mailing 
##  the maintainer or visit http://tengfei.github.com/ggbio/
## 
## Attaching package: 'ggbio'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_bar, geom_rect, geom_segment, ggsave, stat_bin,
##     stat_identity, xlim
library("GenomicRanges")
## Loading required package: stats4
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plotly':
## 
##     rename
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomeInfoDb
library("flexmix")
## Loading required package: lattice
library("mclust")
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:bootstrap':
## 
##     diabetes
## The following object is masked from 'package:mixtools':
## 
##     dmvnorm
library("mixtools")
library("mosaics")
library("mosaicsExample")
library("here")
## here() starts at /Users/abdul-rahmanbukari/Documents/BIOSTATS
library("HistData")
library("bootstrap")
library("vcd")
## 
## Attaching package: 'vcd'
## The following object is masked from 'package:GenomicRanges':
## 
##     tile
## The following object is masked from 'package:IRanges':
## 
##     tile

Finite Mixtures

Generate a random number from a normal distribution with mean 3 and variance 0.25

# Two-component normal mixture driven by fair coin flips: heads -> component 1
# (mean1, sd1), tails -> component 2 (mean2, sd2).
set.seed(1234)
coinflips <- runif(10000) > 0.5
table(coinflips)
## coinflips
## FALSE  TRUE 
##  4990  5010
# Draw a single observation from the component selected by the flip `fl`.
oneFlip <- function(fl, mean1 = 1, mean2 = 3, sd1 = 0.5, sd2 = 0.5) {
  if (fl) rnorm(1, mean1, sd1) else rnorm(1, mean2, sd2)
}
fairmix <- vapply(coinflips, oneFlip, numeric(1))
ggplot(tibble(value = fairmix), aes(x = value)) +
  geom_histogram(fill = "purple", binwidth = 0.1)

We can use R’s vectorized syntax to remove the vapply-loop and generate the fairmix vector more efficiently.

# Vectorized version: a single rnorm() call; ifelse() selects each
# observation's component mean and sd from the corresponding flip.
means <- c(1, 3)
sds   <- c(0.25, 0.25)
values <- rnorm(
  length(coinflips),
  mean = ifelse(coinflips, means[1], means[2]),
  sd   = ifelse(coinflips, sds[1], sds[2])
)
ggplot(tibble(value = values), aes(x = value)) +
  geom_histogram(fill = "purple", binwidth = 0.1)

What if we make one million coin flips and make a histogram with 500 bins

# One million flips, finer histogram (500 bins). The flips are drawn first
# so the values column can reuse them directly.
flips_1e6 <- runif(1e6) > 0.5
fair <- tibble(
  coinflips = flips_1e6,
  values = rnorm(length(flips_1e6),
                 mean = ifelse(flips_1e6, means[1], means[2]),
                 sd   = ifelse(flips_1e6, sds[1], sds[2]))
)
ggplot(fair, aes(x = values)) +
  geom_histogram(fill = "purple", bins = 500)

The histogram gets nearer to a smooth curve called the density function which offers a better approximation of the number of mixtures.

Compare the values of fair$values for which coinflips is TRUE to density function for a normal random variable.

# Density-scaled histogram of the heads-component values, overlaid with the
# theoretical Normal(means[1], sds[1]) density curve in red.
# (`..density..` is the legacy spelling of after_stat(density) in newer ggplot2.)
ggplot(dplyr::filter(fair, coinflips), aes(x = values)) +
   geom_histogram(aes(y = ..density..), fill = "purple",
                  binwidth = 0.01) +
   stat_function(fun = dnorm,
          args = list(mean = means[1], sd = sds[1]), color = "red")

Our distributions so far can be matematically represented as the sum of the density distributions

We can use this concept to generate a graph for two component mixture model with extremely visible distributions (using our previous mean and standard deviations) and minimal overlap.

# Theoretical density of the 50/50 mixture evaluated on a fine grid.
grid_x <- seq(-1, 5, length.out = 1000)
fairtheory <- tibble(
  x = grid_x,
  f = 0.5 * dnorm(grid_x, mean = means[1], sd = sds[1]) +
      0.5 * dnorm(grid_x, mean = means[2], sd = sds[2])
)
ggplot(fairtheory, aes(x = x, y = f)) +
  geom_line(color = "red", size = 1.5) + ylab("mixture density")

What if we had a figure generated by the code chunk below (means 1 and 2; standard deviations sqrt(0.5) and sqrt(0.5)) without prior knowledge of the code? As the means of the individual distributions get closer and the variance increases, the mixture becomes hard to resolve.

# A harder mixture: component means only one unit apart, common variance 0.5.
set.seed(1234)
flips <- runif(1e3) > 0.5
mystery <- tibble(
  coinflips = flips,
  values = rnorm(length(flips),
                 mean = ifelse(flips, 1, 2),
                 sd   = sqrt(0.5))
)
# Shared break points spanning the observed range, 30 bin edges.
br2 <- with(mystery, seq(min(values), max(values), length.out = 30))
ggplot(mystery, aes(x = values)) +
  geom_histogram(fill = "purple", breaks = br2)

# Peek at the simulated data: component label and value.
head(mystery, 3)
## # A tibble: 3 x 2
##   coinflips values
##   <lgl>      <dbl>
## 1 FALSE      2.70 
## 2 TRUE       0.134
## 3 TRUE       1.50
# Common break points so the two component histograms are directly comparable.
br = with(mystery, seq(min(values), max(values), length.out = 30))
# Plot the two components (labels known) in red and dark blue.
ggplot(mystery, aes(x = values)) +
  geom_histogram(data = dplyr::filter(mystery, coinflips),
     fill = "red", alpha = 0.2, breaks = br) +
  geom_histogram(data = dplyr::filter(mystery, !coinflips),
     fill = "darkblue", alpha = 0.2, breaks = br) 

#This code separates the distributions using the known labels and treats overlapping values as belonging to their separate groups. This is evident from the heights of the bars.

We can now plot all the ā€œmysteryā€ graphs in layers over each other

# All three views layered: each labeled component separately (red / dark
# blue) plus the combined mixture histogram (purple), all on the breaks br.
ggplot(mystery, aes(x = values, fill = coinflips)) +
  geom_histogram(data = dplyr::filter(mystery, coinflips),
     fill = "red", alpha = 0.2, breaks = br) +
  geom_histogram(data = dplyr::filter(mystery, !coinflips),
     fill = "darkblue", alpha = 0.2, breaks = br) +
  geom_histogram(fill = "purple", breaks = br, alpha = 0.2)

The Expectation-Maximization (EM) Algorithm

The EM algorithm is used to infer the values of hidden groupings. It does so by alternating between:
- Pretending to know the probability that each observation belongs to a component (and estimating the distribution parameters of these components)
- Pretending to know the distribution parameters of each component (and estimating the probability that each observation belongs to a given component)

# Mixture of two binomials: choose coin 1 with probability p1 (coin 2
# otherwise), then record K = number of heads in two tosses of that coin.
# (Loop variable renamed from `pi` so the built-in constant is not masked.)
probHead <- c(0.125, 0.25)
for (p1 in c(1/8, 1/4)) {
  whCoin <- sample(2, 100, replace = TRUE, prob = c(p1, 1 - p1))
  K <- rbinom(length(whCoin), size = 2, prob = probHead[whCoin])
  print(table(K))
}
## K
##  0  1  2 
## 62 34  4 
## K
##  0  1  2 
## 67 30  3

For example, a variable Y is measured on a series of objects that come from two groups (however we do not know which group each object belongs to). The goal is to estimate the parameters that make the data Y most likely. This can be done by:
- Augmenting the data with a latent (unobserved/hidden) variable, U. This results in a bivariate distribution, because we are looking at the distribution of the ā€œcouplesā€ (Y, U). - Adopting the maximum likelihood approach to find the values of U and the unknown parameters of the underlying densities.

Scenario 1 Suppose we have two unfair coins, whose probabilities of heads are p1=0.125 and p2=0.25. With probability pi we pick coin 1, with probability 1āˆ’pi, coin 2. We then toss that coin twice and record the number of heads K. a) Simulate 100 instances of this procedure, with pi=1/8, and compute the contingency table of K. b) Do the same with pi=1/4. c) Can the aforementioned parameters be estimated from the contingency table?

Solution

# Scenario 1 solution: same two-coin simulation, now seeded so that the
# contingency tables are reproducible.
probHead <- c(0.125, 0.25)
set.seed(1234)
for (p1 in c(1/8, 1/4)) {
  whCoin <- sample(2, 100, replace = TRUE, prob = c(p1, 1 - p1))
  K <- rbinom(length(whCoin), size = 2, prob = probHead[whCoin])
  print(table(K))
}
## K
##  0  1  2 
## 55 33 12 
## K
##  0  1  2 
## 60 29 11

The parameters are difficult to estimate because the model has too many degrees of freedom relative to the information in the contingency table.

Scenario 2 Estimating the parameters of a mixture of two normals with mean parameters unkown and standard deviation equal to 1.

# Generate the model in R, keeping the component labels "u":
# draw 1 or 2 uniformly, 100 times, with replacement.
mus <- c(-0.5, 1.5)
set.seed(1234)
u <- sample(2, 100, replace = TRUE)
# 100 normal draws with mean mus[1] (u = 1) or mus[2] (u = 2), sd = 1
y <- rnorm(length(u), mean = mus[u])
duy <- tibble(u, y)
head(duy)
## # A tibble: 6 x 2
##       u      y
##   <int>  <dbl>
## 1     2 -0.306
## 2     2  0.918
## 3     2  0.391
## 4     2  0.485
## 5     1 -0.662
## 6     2  2.06

Since we know the labels ā€œuā€, we can estimate the means by using separate ML equations for each group.

# With the labels u known, the ML estimate of each component mean is simply
# the within-group sample mean.
duy %>% group_by(u) %>% summarise(mean(y))
## # A tibble: 2 x 2
##       u `mean(y)`
##   <int>     <dbl>
## 1     1    -0.479
## 2     2     1.62
# Fit a normal mixture with mclust.
# BUG FIX: `nclass` is not an argument of Mclust() -- it was silently
# absorbed, so the original call fitted the default range G = 1:9. That is
# why the report's BIC table has nine rows and the classification put every
# observation in a single cluster. `G = 2` requests the intended
# two-component fit.
EMtest <- Mclust(duy$y, G = 2)
EMtest$n        # number of observations used in the fit
## [1] 100
EMtest$df       # degrees of freedom of the selected model
EMtest$BIC      # BIC values for the candidate two-component models
EMtest$classification
## NOTE(review): with G = 2 the BIC table and classification differ from
## the outputs shown in the original report, which correspond to the
## accidental default G = 1:9 fit (all observations assigned to cluster 1).
# Compare with the true labels for the first 20 observations.
head(duy$u, 20)
##  [1] 2 2 2 2 1 2 1 1 1 2 2 2 2 1 2 2 2 1 2 2
# Two-component normal mixture via EM (mixtools): equal starting weights,
# nearly identical starting means and unit starting sds.
gm <- normalmixEM(duy$y, k=2, lambda=c(0.5,0.5), mu=c(-0.01,0.01), sigma=c(1,1))
## number of iterations= 563
gm$lambda    # estimated mixture weights
## [1] 0.09610953 0.90389047
gm$mu        # estimated component means
## [1] -1.6326458  0.9920098
gm$sigma     # estimated component standard deviations
## [1] 0.3038369 1.2265384
gm$loglik    # final log-likelihood
## [1] -172.1765

Models for zero inflated data

Ecological and molecular data often come in the form of counts, such as the count data in RNA-Seq. Such data can often be seen as a mixture of two scenarios: a. If the gene is not expressed, the count is zero. b. If the gene is expressed, the number of transcripts differs between individuals.

The R packages pscl and zicounts provide many examples and functions for working with zero inflated counts.

Demonstration with chromatin immunoprecipitation (ChIP) data

# Bin the ChIP sample reads (STAT1, chr22) into 200 bp bins, extending each
# single-end read to a 200 bp fragment; writes a bin-level text file under
# Book/data (via here()).
constructBins(infile = system.file(file.path("extdata", "wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam"), package="mosaicsExample"),
    fileFormat = "bam", outfileLoc = here("Book", "data"),
    byChr = FALSE, useChrfile = FALSE, chrfile = NULL, excludeChr = NULL,
    PET = FALSE, fragLen = 200, binSize = 200, capping = 0)
## ------------------------------------------------------------
## Info: setting summary
## ------------------------------------------------------------
## Name of aligned read file: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/mosaicsExample/extdata/wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam 
## Aligned read file format: BAM 
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data 
## Construct bin-level files by chromosome? N 
## Is file for chromosome info provided? N 
## Data type: Single-end tag (SET)
## Average fragment length: 200 
## Bin size: 200 
## ------------------------------------------------------------
## Use the provided BAM index file.
## Chromosome information is extracted from the BAM file.
## Info: reading the aligned read file and processing it into bin-level files...
## Info: done!
## ------------------------------------------------------------
## Info: processing summary
## ------------------------------------------------------------
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data 
## Processed bin-level file: wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt
## Sequencing depth: 231822 
## ------------------------------------------------------------
# Same binning for the matched input (control) sample, with identical
# fragment-length and bin-size settings so the two files are comparable.
constructBins(infile = system.file(file.path("extdata", "wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam"), package="mosaicsExample"),
    fileFormat = "bam", outfileLoc = here("Book", "data"),
    byChr = FALSE, useChrfile = FALSE, chrfile = NULL, excludeChr = NULL,
    PET = FALSE, fragLen = 200, binSize = 200, capping = 0)
## ------------------------------------------------------------
## Info: setting summary
## ------------------------------------------------------------
## Name of aligned read file: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/mosaicsExample/extdata/wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam 
## Aligned read file format: BAM 
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data 
## Construct bin-level files by chromosome? N 
## Is file for chromosome info provided? N 
## Data type: Single-end tag (SET)
## Average fragment length: 200 
## Bin size: 200 
## ------------------------------------------------------------
## Use the provided BAM index file.
## Chromosome information is extracted from the BAM file.
## Info: reading the aligned read file and processing it into bin-level files...
## Info: done!
## ------------------------------------------------------------
## Info: processing summary
## ------------------------------------------------------------
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data 
## Processed bin-level file: wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt
## Sequencing depth: 182605 
## ------------------------------------------------------------
# Paths to the two bin-level files written above (ChIP first, input second),
# read into a mosaics BinData object.
datafiles = c(here("Book","data","wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt"), here("Book", "data","wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt"))
binTFBS = readBins(type = c("chip","input"), fileName = datafiles)
## Info: reading and preprocessing bin-level data...
## Info: data contains only one chromosome.
## Info: done!
binTFBS
## Summary: bin-level data (class: BinData)
## ----------------------------------------
## - # of chromosomes in the data: 1
## - total effective tag counts: 462479
##   (sum of ChIP tag counts of all bins)
## - control sample is incorporated
## - mappability score is NOT incorporated
## - GC content score is NOT incorporated
## - uni-reads are assumed
## ----------------------------------------
# print() on a BinData object returns the underlying per-bin data frame;
# histogram of per-bin ChIP tag counts (note the spike at zero).
bincts = print(binTFBS)
ggplot(bincts, aes(x = tagCount)) +
  geom_histogram(binwidth = 1, fill = "forestgreen")

It can be observed that there are many zeros, although from this plot it is not immediately obvious whether the number of zeros is really extraordinary given the frequencies of the other small numbers. Let's redo the histogram of the counts using a logarithm base 10 scale on the y axis and estimate Ļ€0, the proportion of bins with zero counts.

# Same histogram on a log10 y axis to make the zero inflation visible
# (the warnings below come from log10(0) bins).
ggplot(bincts, aes(x = tagCount)) + scale_y_log10() +
   geom_histogram(binwidth = 1, fill = "forestgreen")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 17 rows containing missing values (geom_bar).

###Mixtures greater than two Example: What sort of histogram do we get when weighing 7,000 canonical nucleotides (A,T,G,C) each type has a different weight, measured with the same standard deviation sd=3.

# Four-component mixture: N weighings of nucleotides with masses `masses`,
# sampled with probabilities `probs`, measured with noise sd = 3.
masses <- c(A = 331, C = 307, G = 347, T = 322)
probs  <- c(A = 0.12, C = 0.38, G = 0.36, T = 0.14)
N  <- 7000
sd <- 3
# Draw nucleotide indices, then add Gaussian measurement noise per draw.
nuclt <- sample(length(probs), N, replace = TRUE, prob = probs)
quadwts <- rnorm(length(nuclt), mean = masses[nuclt], sd = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
  geom_histogram(bins = 100, fill = "purple")

What happens if we repeat this exercise with N=1000 nucleotide measurements?

# Same four-component simulation with only N = 1000 measurements.
N  = 1000
sd = 3
set.seed(1234)
nuclt   = sample(length(probs), N, replace = TRUE, prob = probs)
quadwts = rnorm(length(nuclt),
                mean = masses[nuclt],
                sd   = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
  geom_histogram(bins = 100, fill = "purple")

The middle components have become less distinct, seemingly indicating just 3 components; technically this is an overlap of the mixtures. Lesson: increasing the number of measurements enhances the distinctiveness of the components.

What happens if we keep N=7000 measurements but make the standard deviation 10?

# Back to N = 7000, but with the measurement sd raised from 3 to 10:
# the components blur into each other.
N  = 7000
sd = 10
set.seed(1234)
nuclt   = sample(length(probs), N, replace = TRUE, prob = probs)
quadwts = rnorm(length(nuclt),
                mean = masses[nuclt],
                sd   = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
  geom_histogram(bins = 100, fill = "purple")

Now the individual components are even less distinct. It looks like there are only 2 components within our 4 component data. Because increasing the standard deviation results in mass overlap.

Thus as we have enough measurements with good enough precision, we are able to distinguish the four nucleotides and decompose the distribution.

Empirical distributions and the nonparametric bootstrap

Here we will consider an extreme case of mixture models, where the number of observations is equal to the number of mixture components. An example is Darwin's Zea mays data, in which he compared the heights of 15 cross-fertilized and 15 self-fertilized plants.

# Darwin's Zea mays data (HistData): height differences for the 15 pairs,
# drawn as equal-weight (1/15) vertical bars.
ZeaMays$diff
##  [1]  6.125 -8.375  1.000  2.000  0.750  2.875  3.500  5.125  1.750  3.625
## [11]  7.000  3.000  9.375  7.500 -6.000
ggplot(ZeaMays, aes(x = diff, ymax = 1/15, ymin = 0)) +
  geom_linerange(size = 1, col = "forestgreen") + ylim(0, 0.1)

We can bootstrap to estimate the sampling distribution of the median of the Zea Mays differences.

# Nonparametric bootstrap of the median of the 15 Zea mays differences:
# B resamples of size 15 with replacement.
set.seed(1234)
B <- 1000
meds <- vapply(seq_len(B), function(b) {
  idx <- sample(15, 15, replace = TRUE)
  median(ZeaMays$diff[idx])
}, numeric(1))
ggplot(tibble(medians = meds), aes(x = medians)) +
  geom_histogram(bins = 30, fill = "purple")

We can also estimate a 99% confidence interval for the median based on these simulations. The quantile function can be used for this.

# 99% bootstrap confidence interval for the median (percentile method).
quantile(meds,c(0.005,0.995))
##    0.5%   99.5% 
## 0.99875 6.12500

Use the bootstrap package to redo the same analysis using the function bootstrap for both median and mean. What differences do you notice between the sampling distribution of the mean and the median?

# Bootstrap the mean and the median with the bootstrap package; `thetastar`
# holds the B resampled statistic values. Naming the tibble columns gives
# each plot a readable axis label instead of the raw
# `boot_means$thetastar` expression used originally.
boot_means   <- bootstrap(ZeaMays$diff, B, mean)
boot_medians <- bootstrap(ZeaMays$diff, B, median)
ggplot(tibble(mean = boot_means$thetastar), aes(x = mean)) +
  geom_histogram(bins = 30, fill = "purple")

ggplot(tibble(median = boot_medians$thetastar), aes(x = median)) +
  geom_histogram(bins = 30, fill = "purple")

The bootstrap uses a mixture with n components, so with a sample of size n, it qualifies as a nonparametric method. If the sample is composed of n=3 different values, how many different bootstrap resamples are possible? Answer the same question with n = 15.

# A bootstrap resample is a size-n multiset drawn from n values, so the
# number of distinct resamples is choose(2n - 1, n).
n_obs <- c(N3 = 3, N15 = 15)
choose(2 * n_obs - 1, n_obs)
##       N3      N15 
##       10 77558760

###Infinite Mixtures

##Infinite mixture of normals

This is when the number of mixture components is greater than or equal to the number of observations.

Let’s simulate an infinite mixture using a 2-level data generation technique. In level 1, we generate 10000 random deviates (w) from the exponential distribution with rate equal to 1 (and hence mean 1). In level 2, the deviates (w) serve as the variances of normal variables with mean μ generated using rnorm. We finally visualize the data.

# Infinite mixture: exponential variances (level 1) feed the normal draws
# (level 2); the resulting marginal distribution is Laplace.
w <- rexp(10000, rate = 1)
mu <- 0.3
lps <- rnorm(length(w), mean = mu, sd = sqrt(w))
ggplot(data.frame(lps), aes(x = lps)) +
  geom_histogram(fill = "purple", binwidth = 0.1)

This is called a Laplace distribution. It is useful because the median is a good estimator of its location parameter Īø, and its scale parameter Ļ• can be estimated from the median absolute deviation.

###Asymmetric Laplace In the Laplace distribution, the variances of the normal components depend on W, while the means are unaffected. Lets generate and visualize data from an asymmetric Laplace distribution.

# Asymmetric Laplace: the exponential deviate w now shifts the mean
# (theta + mu * w) as well as scaling the standard deviation.
mu <- 0.3
sigma <- 0.4
theta <- -1
w <- rexp(10000, 1)
alps <- rnorm(length(w), theta + mu * w, sigma * sqrt(w))
ggplot(tibble(alps), aes(x = alps)) +
  geom_histogram(fill = "purple", binwidth = 0.1)

Infinite mixtures of poisson variables: the case of gamma-poisson distribution

modelling a real-world count data requires a two-level hierarchical model where simple Poisson and binomial distributions serve as the building blocks at the lower level. Examples of sampling schemes that are well modeled by mixtures of Poisson variables include RNA-Seq, and 16S rRNA-Seq data used in microbial ecology.

###Gamma distribution: two parameters (shape and scale) The gamma distribution is an extension of the (one-parameter) exponential distribution, but it has two parameters, which makes it more flexible. It is often useful as a building block for the upper level of a hierarchical model.

Example gamma distributions

# Two example gamma distributions: shape 2 / rate 1/3 (strongly right-skewed)
# and shape 10 / rate 3/2 (more symmetric).
set.seed(1234)
ggplot(tibble(x = rgamma(10000, shape = 2, rate = 1/3)),
   aes(x = x)) + geom_histogram(bins = 100, fill= "purple")

ggplot(tibble(x = rgamma(10000, shape = 10, rate = 3/2)),
   aes(x = x)) + geom_histogram(bins = 100, fill= "purple")

Example gamma-Poisson hierachical model

# Gamma-Poisson hierarchy: gamma-distributed rates at the upper level,
# Poisson counts drawn at those rates at the lower level.
lambda <- rgamma(10000, shape = 10, rate = 3/2)
gp <- rpois(length(lambda), lambda = lambda)
ggplot(tibble(x = gp), aes(x = x)) +
  geom_histogram(bins = 100, fill = "purple")

The Gamma-Poisson distribution is a discrete distribution. Gamma-Poisson distribution is also called negative binomial distribution. Formula for equation is similar to binomial distribution Gamma-Poisson is more defining of the underlying principles

# Fit a negative binomial (gamma-Poisson) to the simulated counts and show
# the goodness-of-fit rootogram (vcd::goodfit).
ofit = goodfit(gp, "nbinomial")
plot(ofit, xlab = "")

# Fitted parameters: size (dispersion) and prob.
ofit$par
## $size
## [1] 9.970598
## 
## $prob
## [1] 0.5998435

###Variance stabilizing transformations Transformations are important in reducing the heterogeneity and heteroscedasticity of the data. The goal is to stabilze the variance. Instability in the variance is a source of noise. Example of simulated data before and after transformation

# 40 Poisson replicates at each lambda in 100, 200, ..., 900.
set.seed(1234)
lambdas = seq(100, 900, by = 100)
simdat = lapply(lambdas, function(l)
    tibble(y = rpois(n = 40, lambda=l), lambda = l)
  ) %>% bind_rows
# Data before transformation: spread grows with lambda (heteroscedastic).
ggplot(simdat, aes(x = lambda, y = y)) +
  geom_beeswarm(alpha = 0.6, color = "purple")

# Data after square root transformation: spread is roughly constant.
ggplot(simdat, aes(x = lambda, y = sqrt(y))) +
  geom_beeswarm(alpha = 0.6, color = "purple")

# Compare sd of raw vs 2*sqrt-transformed counts, printing at 3 digits
# (the digits option is restored with options(.o) below).
.o = options(digits = 3)
summarise(group_by(simdat, lambda), sd(y), sd(2*sqrt(y)))
## # A tibble: 9 x 3
##   lambda `sd(y)` `sd(2 * sqrt(y))`
##    <dbl>   <dbl>             <dbl>
## 1    100    10.1             0.996
## 2    200    15.4             1.06 
## 3    300    15.5             0.903
## 4    400    18.3             0.913
## 5    500    23.6             1.05 
## 6    600    23.6             0.959
## 7    700    25.6             0.965
## 8    800    31.2             1.10 
## 9    900    32.9             1.10
options(.o)

Repeat the computation in the above code chunk for a version of simdat with a larger number of replicates than 40.

# Same experiment with 1000 replicates per lambda instead of 40.
set.seed(1234)
lambdas = seq(100, 900, by = 100)
simdat2 = lapply(lambdas, function(l)
    tibble(y = rpois(n = 1000, lambda=l), lambda = l)
  ) %>% bind_rows
# Data before transformation: variance still grows with lambda.
ggplot(simdat2, aes(x = lambda, y = y)) +
  geom_beeswarm(alpha = 0.6, color = "purple")

# Data after square root transformation: approximately constant spread.
ggplot(simdat2, aes(x = lambda, y = sqrt(y))) +
  geom_beeswarm(alpha = 0.6, color = "purple")

Now Using the gamma-Poisson distribution and and plotting the 95% confidence intervals around the mean.

# Gamma-Poisson (negative binomial, size = 4) summaries over a grid of
# means mu = 1, 2, 4, ..., 1024: sample mean, sd and central 95% interval.
muvalues <- 2^seq(0, 10, by = 1)
simgp <- bind_rows(lapply(muvalues, function(m) {
  draws <- rnbinom(n = 1e4, mu = m, size = 4)
  tibble(mean = mean(draws), sd = sd(draws),
         lower = quantile(draws, 0.025),
         upper = quantile(draws, 0.975),
         mu = m)
}))
head(as.data.frame(simgp), 2)
##     mean       sd lower upper mu
## 1 1.0043 1.130048     0     4  1
## 2 2.0171 1.711286     0     6  2
# Mean with the 95% interval as error bars, on the raw scale.
ggplot(simgp, aes(x = mu, y = mean, ymin = lower, ymax = upper)) +
  geom_point() + geom_errorbar()

To transform these data to stabilize their standard deviation: divide the values that correspond to mu[1] (which are centered around simgp$mean[1]) by their standard deviation simgp$sd[1], the values that correspond to mu[2] (centered around simgp$mean[2]) by simgp$sd[2], and so on; the resulting values then have, by construction, a standard deviation (and thus variance) of 1.

# Piecewise variance-stabilizing transformation: local slope 1/sd at each
# grid point, accumulated with cumsum() to build a monotone transform.
# NOTE(review): cumsum(slopes * mean) multiplies the slope by the mean value
# itself rather than by the increment between successive means -- confirm
# this matches the intended numerical integration of 1/sd.
simgp = mutate(simgp,
  slopes = 1 / sd,
  trsf   = cumsum(slopes * mean))
ggplot(simgp, aes(x = mean, y = trsf)) +
  geom_point() + geom_line() + xlab("")

##Exercises

##Excersise 1 The EM algorithm step by step Model the Myst.rds dataset as a mixture of two normal distributions (A and B) with unknown means and standard deviations.

# Load the mystery dataset for the EM exercise and inspect its histogram.
yvar = readRDS(here("Book","data","Myst.rds"))$yvar
ggplot(tibble(yvar), aes(x = yvar)) + geom_histogram(binwidth=0.025)

str(yvar)
##  num [1:1800] 0.3038 0.0596 -0.0204 0.1849 0.2842 ...

Create the Distributions with the code chunck below.

# EM algorithm, step by step, for a two-component normal mixture fit to yvar.
# Initialize the responsibilities at random: pA[i] is the current probability
# that observation i belongs to component A, pB its complement.
pA = runif(length(yvar))
pB = 1 - pA
# Housekeeping: iter counts EM iterations; loglik holds the current
# log-likelihood; delta its change from the previous iteration.
# tolerance / miniter / maxiter control termination.
iter = 0
loglik = -Inf
delta = +Inf
tolerance = 1e-3
miniter = 50; maxiter = 1000
# Iterate until the log-likelihood stabilizes, but always run at least
# miniter iterations and never more than maxiter.
while (((delta > tolerance) && (iter <= maxiter)) || (iter < miniter)) {
  # M-step: re-estimate mixture weight, means and sds from the current
  # responsibilities (weighted maximum likelihood).
  lambda = mean(pA)
  muA = weighted.mean(yvar, pA)
  muB = weighted.mean(yvar, pB)
  sdA = sqrt(weighted.mean((yvar - muA)^2, pA))
  sdB = sqrt(weighted.mean((yvar - muB)^2, pB))

  # E-step: recompute responsibilities from the updated parameters.
  phiA = dnorm(yvar, mean = muA, sd = sdA)
  phiB = dnorm(yvar, mean = muB, sd = sdB)
  pA   = lambda * phiA
  pB   = (1 - lambda) * phiB
  ptot = pA + pB
  pA   = pA / ptot
  pB   = pB / ptot

  loglikOld = loglik
  # BUG FIX: the observed-data log-likelihood is sum(log(ptot)), the sum of
  # log mixture densities. The original sum(log(pA)) summed log *posterior*
  # probabilities (pA had already been normalized), which is not the
  # quantity EM maximizes.
  loglik = sum(log(ptot))
  delta = abs(loglikOld - loglik)
  iter = iter + 1
}
param = tibble(group = c("A","B"), mean = c(muA,muB), sd = c(sdA,sdB))
param
## The estimates converge to approximately the values below; the iteration
## count differs from the original run (359) because the convergence
## criterion now tracks the true log-likelihood.
## # A tibble: 2 x 3
##   group   mean     sd
##   <chr>  <dbl>  <dbl>
## 1 A      0.147 0.150 
## 2 B     -0.169 0.0983
iter
  1. Which lines correspond to the E-step, which to the M-step?

The EM algorithm is: i. Given a set of incomplete data, choose a set of starting parameters (here, random responsibilities). ii. Expectation step (E-step): using the current parameter estimates, compute the probability (responsibility) that each observation belongs to each component. This is represented in the code as: > phiA = dnorm(yvar, mean = muA, sd = sdA) > phiB = dnorm(yvar, mean = muB, sd = sdB) > pA = lambda * phiA > pB = (1 - lambda) * phiB > ptot = pA + pB
    > pA = pA / ptot > pB = pB / ptot

iii. Maximization step (M-step): using the responsibilities computed in the E-step, update the parameter estimates by weighted maximum likelihood. This is represented in the code as: > lambda = mean(pA) > muA = weighted.mean(yvar, pA) > muB = weighted.mean(yvar, pB) > sdA = sqrt(weighted.mean((yvar - muA)^2, pA)) > sdB = sqrt(weighted.mean((yvar - muB)^2, pB))

What does the E-step do, what does the M-step do? The E-step estimates the membership probabilities of the observations given the current parameters; the M-step updates the parameter estimates given those membership probabilities. (Note: the original answer had the two steps labeled the wrong way around.)

  1. What is the role of the algorithm arguments tolerance, miniter and maxiter? Tolerance is the minimum change in the estimated log-likelihood: a difference <= tolerance indicates convergence and terminates the iteration. Miniter is the minimum number of iterations of the E and M steps; maxiter is the maximum.

d.Why do we need to compute loglik? loglik estimation is relevant for determining convergence. It is used in estimating delta which is in turn compared with the predefined tolerance argument for assessing convergence.

  1. Compare the result to the output of the normalmixEM function from the mixtools package.
# Cross-check: normalmixEM, started from our hand-rolled EM estimates,
# converges to essentially the same parameters.
mixtoolsQ <- normalmixEM(yvar, k=2,lambda=c(0.5,0.5), mu=c(muA,muB), sigma=c(sdA,sdB),  maxit=1000)
## number of iterations= 149
mixtoolsQ$mu       # component means (cf. param$mean)
## [1]  0.1473373 -0.1693509
mixtoolsQ$sigma    # component sds (cf. param$sd)
## [1] 0.14977994 0.09826952
mixtoolsQ$loglik   # final log-likelihood
## [1] 417.2218

The results are similar. However, the normalmixEM function achieves the optimal log-likelihood in fewer iterations (149).

##Excersise 2 Compare the theoretical values of the gamma-Poisson distribution with parameters given by the estimates in ofit$par in Section 4.4.3 to the data used for the estimation using a QQ-plot.

# QQ-plot: empirical quantiles of the simulated counts gp against the
# theoretical gamma-Poisson (negative binomial) quantiles at the fitted
# parameters. ppoints() supplies the plotting probabilities.
ordinates = ppoints(1000)
quant_raw = quantile(gp, ordinates)

# qnbinom() already returns the theoretical quantiles at `ordinates`;
# the original code took quantile() of these quantiles a second time, which
# is redundant and slightly distorts the tails. Use them directly.
gamma_pois = qnbinom(p = ordinates, size = ofit$par[[1]], prob = ofit$par[[2]])

# Empirical vs theoretical quantiles, with the y = x reference line.
plot(quant_raw, gamma_pois)
abline(a = 0, b = 1, col = "red")

There is a high similarity between the empirical and theoretical quantiles.

## Exercise 3

  1. First, plot the data and try to guess how the points were generated.
# Load the NPreg data, view the first rows, then plot it below
# (NPreg: one covariate x plus responses yn, yp, yb and a class label)

data("NPreg")
head(NPreg)
##          x        yn yp yb class id1 id2
## 1 4.176633 22.380379  4  0     1   1   1
## 2 1.201631  5.111575  3  0     1   1   1
## 3 2.295006 13.251058  9  0     1   2   1
## 4 5.965868 30.285240  3  1     1   2   1
## 5 2.358083 14.764508  4  0     1   3   2
## 6 7.637061 42.833760  1  1     1   3   2
# Column-wise summary statistics for NPreg
summary(NPreg)
##        x                 yn               yp               yb       
##  Min.   :0.06295   Min.   :-1.482   Min.   : 0.000   Min.   :0.000  
##  1st Qu.:2.67892   1st Qu.:20.489   1st Qu.: 2.000   1st Qu.:0.000  
##  Median :5.07820   Median :30.933   Median : 4.000   Median :0.000  
##  Mean   :5.03078   Mean   :28.607   Mean   : 3.955   Mean   :0.475  
##  3rd Qu.:7.37686   3rd Qu.:38.322   3rd Qu.: 5.000   3rd Qu.:1.000  
##  Max.   :9.89474   Max.   :53.487   Max.   :12.000   Max.   :1.000  
##                                                                     
##      class          id1           id2     
##  Min.   :1.0   1      :  2   1      :  4  
##  1st Qu.:1.0   2      :  2   2      :  4  
##  Median :1.5   3      :  2   3      :  4  
##  Mean   :1.5   4      :  2   4      :  4  
##  3rd Qu.:2.0   5      :  2   5      :  4  
##  Max.   :2.0   6      :  2   6      :  4  
##                (Other):188   (Other):176
#There is an independent variable (x), three dependent variables following the Normal (yn), Poisson (yp) and Binomial (yb) regressions, and two classes.

# Scatter plot of each response against x, coloured by the true class
ggplot(NPreg, aes(x = x, y = yn, color = as.factor(class))) +
  geom_point()

ggplot(NPreg, aes(x = x, y = yp, color = as.factor(class))) +
  geom_point()

ggplot(NPreg, aes(x = x, y = yb, color = as.factor(class))) +
  geom_point()

Of all the plots, the normal regression (yn) is most easily distinguishable by class. Class 1 seems to follow a linear regression while class 2 follows a non-linear (possibly quadratic) regression.

  1. Fit a two component mixture model using the commands;
# Fit a two-component mixture of regressions, quadratic in x, to yn
m1 <- flexmix(yn ~ x + I(x^2), data = NPreg, k = 2)

# Printing the fitted object shows the cluster sizes and the number of EM
# iterations to convergence; summary() gives more detail.
m1
## 
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
## 
## Cluster sizes:
##   1   2 
## 100 100 
## 
## convergence after 15 iterations
summary(m1)
## 
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
## 
##        prior size post>0 ratio
## Comp.1 0.494  100    145 0.690
## Comp.2 0.506  100    141 0.709
## 
## 'log Lik.' -642.5453 (df=9)
## AIC: 1303.091   BIC: 1332.775
#We can also observe the associated parameters of the components by running parameters() and selecting the appropriate component
  1. Look at the estimated parameters of the mixture components and make a truth table that cross-classifies true classes versus cluster memberships. What does the summary of the object m1 show us?
# Estimated regression coefficients and sigma for component 1
parameters(m1, component = 1)
##                       Comp.1
## coef.(Intercept) -0.20959655
## coef.x            4.81774867
## coef.I(x^2)       0.03616097
## sigma             3.47652037
# Estimated regression coefficients and sigma for component 2
parameters(m1, component = 2)
##                      Comp.2
## coef.(Intercept) 14.7171140
## coef.x            9.8466333
## coef.I(x^2)      -0.9683531
## sigma             3.4796257
# Truth table: cross-classify the true classes against the estimated
# cluster memberships
table(NPreg$class, clusters(m1))
##    
##      1  2
##   1 95  5
##   2  5 95
# Priors, sizes, posterior counts, log-likelihood, AIC and BIC of the fit
summary(m1)
## 
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
## 
##        prior size post>0 ratio
## Comp.1 0.494  100    145 0.690
## Comp.2 0.506  100    141 0.709
## 
## 'log Lik.' -642.5453 (df=9)
## AIC: 1303.091   BIC: 1332.775

The summary provides information on the log Lik (-642.5453), degrees of freedom (df=9), AIC (1303.091) and BIC (1332.775). The prior, size and posterior (post>0) of both components are also indicated.

  1. Plot the data again, this time coloring each point according to its estimated class.
# Colour each point by its estimated cluster; point shape encodes the
# true class so misclassified points stand out
ggplot(NPreg, aes(x = x, y = yn, group = class)) +
  geom_point(
    aes(colour = as.factor(m1@cluster), shape = factor(class))
  )

Exercise 4

Other hierarchical noise models: Find two papers that explore the use of other infinite mixtures for modeling molecular biology technological variation.

  1. Yau, C., Papaspiliopoulos, O., Roberts, G.O. and Holmes, C., 2011. Bayesian non‐parametric hidden Markov models with applications in genomics. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 73(1), pp.37-57.

  2. Dang, T. and Kishino, H., 2019. Stochastic variational inference for Bayesian phylogenetics: A case of CAT model. Molecular biology and evolution, 36(4), pp.825-833.